###########################################################################
## Challenging Encounters and Within-Physician Practice Variability
## Script 2 - Build the database
## Last update: 29-1-23
###########################################################################

## Setup
###########################################################################


# Packages
library(dplyr)
library(readxl)
library(haven)
library(lubridate)
library(xtable)

# Directory
# The script runs from the current directory; relative inputs such as
# "R_Mac/..." and "raw_data/..." are resolved against it.
wd <- "."
setwd(wd)
# Project folder that receives the derived data and the outputs.
pd <- file.path(".", "project")

# Function
# Most frequent value of a vector. Ties go to the value that appears first
# in `v`; the result has the same type as `v`.
getmode <- function(v) {
  distinct_values <- unique(v)
  # Position of every element of `v` within its distinct values
  positions <- match(v, distinct_values)
  # Occurrences per distinct value, in order of first appearance
  occurrences <- tabulate(positions)
  distinct_values[which.max(occurrences)]
}
###########################################################################


## Assign Comparison Cases
###########################################################################

# Import the visits and keep the primary care ones (specialization code 10)
Visits <- readRDS(file.path("R_Mac", "R_files", "Visits"))
Visits <- Visits[Visits$specialization_code == 10, ]

# Delete physician-timestamp duplicates: rows are sorted by physician and
# time, then the first row of every (doctor_id, time) pair is kept
Visits <- Visits[with(Visits, order(doctor_id, time)), ]
Visits <- distinct(Visits, doctor_id, time, .keep_all = TRUE)

# Add columns - day of the week, week of the year and year
Visits$weekday <- weekdays(Visits$date)
Visits$yearweek <- week(Visits$date)
Visits$year <- year(Visits$date)

# Serial visit: position of the visit within the physician's day, following
# the time ordering established above
Visits <- Visits %>%
  group_by(date, doctor_id) %>%
  mutate(num_visit = seq_len(n()))


######### Difficult Cases
# Import the events and sort them by time
Treatment <- readRDS(file.path(pd, "data", "Events"))
Treatment <- Treatment[order(Treatment$time), ]
# Delete duplicates - two events for the same physician-day. Given the sort
# above, the earliest event of the day is the one kept.
Treatment <- distinct(ungroup(Treatment), doctor_id, date, .keep_all = TRUE)
# Add Real Event ID for each event: "T" followed by the row number.
# seq_len() yields an empty vector when there are no events, whereas
# 1:length() would yield c(1, 0) and make the assignment fail.
Treatment$Real_Event_ID <- paste0("T", seq_len(nrow(Treatment)))


######### Assign Comparison Cases Alternative 1 (two weeks before and after)
# Start with the difficult cases and attach the serial number (num_visit)
# that each of them had within the physician's day
Control1 <- Treatment
Control1$weekday <- weekdays(Control1$date)
Control1$yearweek <- week(Control1$date)
Control1$year <- year(Control1$date)
Control1 <- merge(Control1, Visits[, c("doctor_id", "time", "num_visit")],
                  by = c("doctor_id", "time"))

# Construct the comparison cases: one copy of every difficult case per week
# shift K, stacked in the order +1, -1, +2, -2. The shifts are integers so
# that K keeps the integer type it had when it was taken from 1:2.
# NOTE(review): the shift is plain arithmetic on the week number, so a shifted
# value outside the range that week() returns for the visits cannot match any
# visit below - events near the start or end of a year presumably lose some
# comparison cases. Confirm that this is intended.
C <- Control1
Control1 <- do.call(rbind, lapply(c(1L, -1L, 2L, -2L), function(k) {
  shifted <- C
  shifted$yearweek <- shifted$yearweek + k
  shifted$K <- k
  shifted
}))

# Keep cases during our sample period
Control1 <- Control1[Control1$year > 2011 & Control1$year < 2016, ]
# Rename time column: it now holds the time of the original difficult case
colnames(Control1)[colnames(Control1) == "time"] <- "Original_Time"

# Match the relevant visits: same physician, year, (shifted) week, weekday
# and serial visit number.
# Control1[, -7] drops the 7th column by position - presumably the event's
# own `date`, so that `date` and `time` below come from the matched visit.
# TODO confirm against the column layout of the Events file.
V <- merge(Visits, Control1[, -7],
           by = c("doctor_id", "year", "yearweek", "weekday", "num_visit"))
V <- V[, c("Real_Event_ID", "doctor_id", "date", "time",
           "year", "yearweek", "weekday", "num_visit")]
Control1 <- merge(Control1[, -7], V,
                  by = c("Real_Event_ID", "doctor_id", "year", "yearweek",
                         "weekday", "num_visit"))
# One comparison case per difficult case and matching key
Control1 <- distinct(ungroup(Control1), Real_Event_ID, doctor_id, year,
                     yearweek, weekday, num_visit, .keep_all = TRUE)



######### Assign Comparison Cases Alternative 2 (other years)
# Start with the difficult cases and attach their serial visit number
Control2 <- Treatment
Control2$weekday <- weekdays(Control2$date)
Control2$yearweek <- week(Control2$date)
Control2$year <- year(Control2$date)
Control2 <- merge(Control2, Visits[, c("doctor_id", "time", "num_visit")],
                  by = c("doctor_id", "time"))

# Change years: one copy of every difficult case per year offset, stacked in
# the order +1, +2, +3, -1, -2, -3
Control2 <- do.call(rbind, lapply(
  c(1, 2, 3, -1, -2, -3),
  function(years_ahead, cases) {
    cases$year <- cases$year + years_ahead
    cases
  },
  cases = Control2
))

# Keep cases in our sample period
Control2 <- Control2[Control2$year > 2011 & Control2$year < 2016, ]
# Rename time column (second column after the merge on doctor_id and time)
colnames(Control2)[2] <- "Original_Time"

# Match the relevant visits: same physician, (shifted) year, week, weekday
# and serial visit number. Column 7 is dropped by position, as in the
# original code.
V <- merge(Visits, Control2[, -7],
           by = c("doctor_id", "year", "yearweek", "weekday", "num_visit"))
V <- V[, c("Real_Event_ID", "doctor_id", "date", "time",
           "year", "yearweek", "weekday", "num_visit")]
Control2 <- merge(Control2[, -7], V,
                  by = c("Real_Event_ID", "doctor_id", "year", "yearweek",
                         "weekday", "num_visit"))
Control2 <- distinct(ungroup(Control2), Real_Event_ID, doctor_id, year,
                     yearweek, weekday, num_visit, .keep_all = TRUE)




######### Assign the main definition of Comparison Cases (two weeks before and after on every year- for matching)
# Start with the difficult cases and attach their serial visit number
Control3 <- Treatment
Control3$weekday <- weekdays(Control3$date)
Control3$yearweek <- week(Control3$date)
Control3$year <- year(Control3$date)
Control3 <- merge(Control3, Visits[, c("doctor_id", "time", "num_visit")],
                  by = c("doctor_id", "time"))

# Create Comparison Cases. For every sample year the parts are stacked in the
# order: same week, one week before, two weeks before, one week after,
# two weeks after.
C <- Control3
Control3 <- do.call(rbind, lapply(2012:2015, function(y) {
  # Same week: only for events that did not occur in year y
  same_week <- C[C$year != y, ]
  same_week$year <- y
  # Neighbouring weeks: every event, including those of year y itself
  neighbours <- lapply(c(-1, -2, 1, 2), function(week_shift) {
    moved <- C
    moved$year <- y
    moved$yearweek <- moved$yearweek + week_shift
    moved
  })
  do.call(rbind, c(list(same_week), neighbours))
}))

# Rename time column (second column after the merge on doctor_id and time)
colnames(Control3)[2] <- "Original_Time"

# Match the relevant visits: same physician, year, week, weekday and serial
# visit number. Column 7 is dropped by position, as in the original code.
V <- merge(Visits, Control3[, -7],
           by = c("doctor_id", "year", "yearweek", "weekday", "num_visit"))
V <- V[, c("Real_Event_ID", "doctor_id", "date", "time",
           "year", "yearweek", "weekday", "num_visit")]
Control3 <- merge(Control3[, -7], V,
                  by = c("Real_Event_ID", "doctor_id", "year", "yearweek",
                         "weekday", "num_visit"))
Control3 <- distinct(ungroup(Control3), Real_Event_ID, doctor_id, year,
                     yearweek, weekday, num_visit, .keep_all = TRUE)



###########################################################################


## Create Sample
###########################################################################

# Add ID for each Event (Difficult Case or Comparison Case).
# seq_len() gives an empty ID vector for a comparison set without matched
# cases, where 1:length() would give c(1, 0) and make the assignment fail.
Treatment$Event_ID <- Treatment$Real_Event_ID
Control1$Event_ID <- paste0("C1_", seq_len(nrow(Control1)))
Control2$Event_ID <- paste0("C2_", seq_len(nrow(Control2)))
Control3$Event_ID <- paste0("C3_", seq_len(nrow(Control3)))

# Mark treatment group and comparison by type.
# rep(value, nrow()) is used instead of a bare scalar because a zero-row
# data frame rejects a length-one replacement.
Treatment$Treat <- rep(1, nrow(Treatment))
Control1$Treat <- rep(0, nrow(Control1))
Control2$Treat <- rep(0, nrow(Control2))
Control3$Treat <- rep(0, nrow(Control3))

Treatment$Control <- rep(0, nrow(Treatment))
Control1$Control <- rep(1, nrow(Control1))
Control2$Control <- rep(2, nrow(Control2))
Control3$Control <- rep(3, nrow(Control3))

# Combine Treatment & Comparison events (one row per event day)
event_day_cols <- c("doctor_id", "date", "Treat", "Control",
                    "Real_Event_ID", "Event_ID")
Events <- rbind(Treatment[, event_day_cols],
                Control1[, event_day_cols],
                Control2[, event_day_cols],
                Control3[, event_day_cols])
Events$doctor_id <- as.numeric(Events$doctor_id)

# Create a sample of visits on days of event. A physician-day that belongs to
# several events is repeated once per event, each copy with its own Event_ID.
Visits$doctor_id <- as.numeric(Visits$doctor_id)
Visits <- merge(Visits, Events, by = c("doctor_id", "date"))
Visits <- as.data.frame(Visits)
Visits <- Visits[, c("patient_id", "time", "date", "doctor_id", "visit_length",
                     "Treat", "Control", "num_visit", "Event_ID",
                     "Real_Event_ID")]

# Mark events: Event is 1 on the visit that is the event itself, 0 elsewhere
event_visit_cols <- c("doctor_id", "time", "Event_ID", "Real_Event_ID")
Events <- rbind(as.data.frame(Treatment[, event_visit_cols]),
                as.data.frame(Control1[, event_visit_cols]),
                as.data.frame(Control2[, event_visit_cols]),
                as.data.frame(Control3[, event_visit_cols]))
Events$Event <- rep(1, nrow(Events))
Visits <- merge(Visits, Events,
                by = c("doctor_id", "time", "Event_ID", "Real_Event_ID"),
                all.x = TRUE)
Visits$Event[is.na(Visits$Event)] <- 0

# Visits relative to event: Event_Num is the serial number of the event visit,
# spread to every visit of the same Event_ID
Visits <- Visits[order(Visits$doctor_id, Visits$time), ]
Visits$Event_Num <- 0
Visits$Event_Num[Visits$Event == 1] <- Visits$num_visit[Visits$Event == 1]
Visits <- mutate(group_by(Visits, Event_ID), Event_Num = max(Event_Num))
Visits$Visits_From_Event <- Visits$num_visit - Visits$Event_Num

# Hour: characters 12-13 and 15-16 of the timestamp text are read as the hour
# and the minutes, and combined into a decimal hour
Visits$Hour <- substr(Visits$time, 12, 13)
Visits$Minutes <- substr(Visits$time, 15, 16)
Visits$Time_Fixed <- as.numeric(Visits$Hour) + as.numeric(Visits$Minutes) / 60

# Add cancer type.
# Columns 10 and 3 of Treatment are taken by position; one of them has to be
# Real_Event_ID for the merge to work, the other is presumably the cancer type.
# TODO confirm against the column layout of the Events file.
Visits <- merge(Visits, Treatment[, c(10, 3)], by = c("Real_Event_ID"))

# Save
Visits <- Visits[order(Visits$doctor_id, Visits$date), ]
saveRDS(Visits, file.path(pd, "data", "data_1"))
Visits <- NULL
gc()
###########################################################################

gc()

## Create Outcome Variables
###########################################################################

# Lab tests
sample <- readRDS(file.path(pd, "data", "data_1"))[, c("patient_id", "date", "doctor_id")]
# data_1 holds a visit once per event it was attached to, so the same
# patient-date-physician key can repeat in `sample`. merge() returns every
# matching pair of rows, so matching against the repeated keys would multiply
# the lab rows and inflate `total`, `lab_tests` and `lab_dups`. The distinct
# keys are used for the match and for the frequency denominator; `sample`
# itself is left untouched.
sample_visits <- distinct(sample)
Lab <- readRDS(file.path("R_Mac", "R_files", "Lab_Referrals"))[, c("patient_id", "date", "doctor_id", "lab_code", "lab_desc")]
Lab <- merge(Lab, sample_visits, by = c("patient_id", "date", "doctor_id"))
gc()
# Mark blood tests: descriptions containing "(B)" or "Blood" share code 5022
Lab$lab_code[grepl("(B)", Lab$lab_desc, fixed = TRUE)] <- 5022
Lab$lab_code[grepl("Blood", Lab$lab_desc, fixed = TRUE)] <- 5022
# Mark biopsy tests: descriptions containing "Biopsy" share code 8300
Lab$lab_code[grepl("Biopsy", Lab$lab_desc, fixed = TRUE)] <- 8300
# One row per visit and test code; `total` counts how many times the code was
# ordered in that visit
Lab <- summarise(group_by(Lab, patient_id, date, doctor_id, lab_code),
                 total = n(), lab_desc = getmode(lab_desc))

# Check tests' frequencies: share of the sampled visits that include the code
L <- summarise(group_by(Lab, lab_code),
               frequency = n() / nrow(sample_visits),
               lab_desc = getmode(lab_desc))
L <- L[order(-L$frequency), ]
L <- L[L$frequency > 0.03, ]
L <- L[c(1:5, 8:12), ] # Drop duplicates of cholesterol
a <- print(xtable(L))
write.table(a, file.path(pd, "outputs", "Most_Common_Lab_Tests.txt"))
write.csv(L, file.path(pd, "outputs", "Most_Common_Lab_Tests.csv"))

# Mark common tests. The codes are hard-coded, presumably read off the
# frequency table above (which keeps codes found in more than 3% of visits).
Lab$Blood <- Lab$lab_code == 5022
Lab$Urine <- Lab$lab_code == 1000
Lab$Cholesterol <- Lab$lab_code == 3718 | Lab$lab_code == 3719 | Lab$lab_code == 2465
Lab$Triglycerides <- Lab$lab_code == 4478
Lab$Phosphatase <- Lab$lab_code == 4075
Lab$TSH <- Lab$lab_code == 4443
Lab$Alt_GPT <- Lab$lab_code == 4460
Lab$VitaminB12 <- Lab$lab_code == 2607
Lab$AST_GOT <- Lab$lab_code == 4450
Lab$Ferritin <- Lab$lab_code == 2728

# Mark cancer specific tests
Lab$Biopsy <- grepl("Biopsy", Lab$lab_desc, fixed = TRUE)
Lab$PSA <- Lab$lab_code == 4153
Lab$Occult_Blood <- Lab$lab_code == 22731

# Summarise to one row per visit: number of tests, number of distinct codes,
# number of codes ordered more than once, and a count per marked test
Lab <- summarise(group_by(Lab, patient_id, date, doctor_id),
                 lab_tests = sum(total),
                 lab_codes = n(),
                 lab_dups = sum(total > 1),
                 Alt_GPT = sum(Alt_GPT > 0),
                 VitaminB12 = sum(VitaminB12 > 0),
                 AST_GOT = sum(AST_GOT > 0),
                 Ferritin = sum(Ferritin > 0),
                 Blood = sum(Blood > 0),
                 Urine = sum(Urine > 0),
                 Cholesterol = sum(Cholesterol > 0),
                 Triglycerides = sum(Triglycerides > 0),
                 Phosphatase = sum(Phosphatase > 0),
                 TSH = sum(TSH > 0),
                 Biopsy = sum(Biopsy > 0),
                 PSA = sum(PSA > 0),
                 Occult_Blood = sum(Occult_Blood > 0))
saveRDS(Lab, file.path(pd, "data", "lab_tests_data"))

# Remove
Lab <- NULL
gc()

# Add referrals
# sample <- readRDS(file.path(pd,"data","data_1"))[,c("patient_id","date","doctor_id")]
# `sample` (loaded for the lab tests) repeats a visit key once per event the
# visit was attached to. merge() returns every matching pair of rows, so the
# distinct keys are used to avoid multiplying the referral rows and inflating
# `total`, `Referrals` and `ref_dups`.
sample_visits <- distinct(sample)
Refs <- readRDS(file.path("R_Mac", "R_files", "Referrals"))[, c("patient_id", "date", "doctor_id", "ref_code", "ref_desc")]
Refs <- merge(Refs, sample_visits, by = c("patient_id", "date", "doctor_id"))
gc()
# One row per visit and referral code; `total` counts how many times the code
# was issued in that visit
Refs <- summarise(group_by(Refs, patient_id, date, doctor_id, ref_code),
                  total = n(), ref_desc = getmode(ref_desc))

# Check referrals frequencies: share of the sampled visits that include the
# code, reported in percent
R <- summarise(group_by(Refs, ref_code),
               frequency = n() / nrow(sample_visits),
               ref_desc = getmode(ref_desc))
R <- R[!grepl("C", R$ref_code, fixed = TRUE), ] # Drop specialists
R <- R[order(-R$frequency), ]
R <- R[1:25, ]
R$frequency <- R$frequency * 100
b <- print(xtable(R))
write.table(b, file.path(pd, "outputs", "Most_Common_Imaging_Tests_LATEX.txt"))
write.csv(R, file.path(pd, "outputs", "Most_Common_Imaging_Tests_LATEX.csv"))

# Mark common tests
Refs$Dimut <- Refs$ref_code == 99939
Refs$Rentgen <- Refs$ref_code == 99942
Refs$Ultrasound <- Refs$ref_code == 99935
Refs$Density <- Refs$ref_code == 99926
Refs$Dufler <- Refs$ref_code == 93320
Refs$Heart_Eco <- Refs$ref_code == 99884
Refs$ER <- Refs$ref_code == 99865
Refs$Emer <- Refs$ref_code == 99866
Refs$Mammo <- Refs$ref_code == 99936
Refs$EKG <- Refs$ref_code == 93000
Refs$Heart_Machon <- Refs$ref_code == 99923
Refs$Machon <- Refs$ref_code == 99919
Refs$Consult <- grepl("CON", Refs$ref_code, fixed = TRUE)

## Mark colonoscopy referrals (Gastric institute)
Refs$Gastro <- Refs$ref_code == 99921

# Summarise to one row per visit. ref_dups counts the codes issued more than
# once in the visit, mirroring lab_dups; with `total > 0` it was always equal
# to ref_codes.
Refs <- summarise(group_by(Refs, date, patient_id, doctor_id),
                  Referrals = sum(total),
                  ref_codes = n(),
                  ref_dups = sum(total > 1),
                  Consult = sum(Consult),
                  ER = sum(ER),
                  Emer = sum(Emer),
                  Dimut = sum(Dimut),
                  Ultrasound = sum(Ultrasound),
                  Rentgen = sum(Rentgen),
                  Density = sum(Density),
                  Dufler = sum(Dufler),
                  Heart_Eco = sum(Heart_Eco),
                  Mammo = sum(Mammo),
                  Machon = sum(Machon),
                  EKG = sum(EKG),
                  Heart_Machon = sum(Heart_Machon),
                  Gastro = sum(Gastro))
saveRDS(Refs, file.path(pd, "data", "referrals_data"))

# Remove
Refs <- NULL
gc()


# Prescriptions
Pres <- readRDS(file.path("R_Mac", "R_files", "Prescriptions"))[, c("patient_id", "date_pres", "doctor_id", "drug_desc")]
# The prescription date plays the role of the visit date in the match below
names(Pres)[names(Pres) == "date_pres"] <- "date"
# One row per visit key and drug description
Pres <- distinct(Pres, date, patient_id, doctor_id, drug_desc)
sample <- readRDS(file.path(pd, "data", "data_1"))[, c("patient_id", "date", "doctor_id")]
Pres <- merge(Pres, sample, by = c("patient_id", "date", "doctor_id"))
# Drug = 1 for every sampled visit that has at least one prescription; the
# drug description is dropped before collapsing to one row per visit
Pres$Drug <- 1
Pres <- Pres[, c("patient_id", "date", "doctor_id", "Drug")] %>%
  group_by(date, patient_id, doctor_id) %>%
  summarise(Drug = max(Drug, na.rm = TRUE))
sample <- NULL
gc()
saveRDS(Pres, file.path(pd, "data", "prescriptions_data"))
Pres <- NULL
gc()



# Merge everything
Data <- readRDS(file.path(pd, "data", "data_1"))

# Split the sample by event type; comparison definition 3 is split further by
# calendar year (presumably to keep each merge small). The list order is the
# order in which the parts are stacked at the end: difficult cases first,
# then comparison definitions 1, 2 and 3.
event_year <- year(Data$date)
parts <- list(
  Data[Data$Control == 0, ],
  Data[Data$Control == 1, ],
  Data[Data$Control == 2, ],
  Data[Data$Control == 3 & event_year < 2014, ],
  Data[Data$Control == 3 & event_year == 2014, ],
  Data[Data$Control == 3 & event_year == 2015, ]
)
rm(event_year)
Data <- NULL
gc()

# Attach lab tests, referrals and prescriptions in turn. Each outcome table is
# joined on its own first three columns (the visit key), keeping every visit
# whether or not it has a match; parts are replaced one at a time.
for (outcome_file in c("lab_tests_data", "referrals_data", "prescriptions_data")) {
  outcomes <- readRDS(file.path(pd, "data", outcome_file))
  visit_keys <- colnames(outcomes)[1:3]
  for (p in seq_along(parts)) {
    parts[[p]] <- merge(parts[[p]], outcomes, by = visit_keys, all.x = TRUE)
  }
  outcomes <- NULL
  gc()
}

Data <- do.call(rbind, parts)
rm(parts)
gc()

# Mark zeros: a visit without a matching lab, referral or prescription row
# has missing outcomes after the merges, which stand for "none"
vars <- c("lab_tests", "lab_dups", "lab_codes", "Urine", "Blood", "Cholesterol",
          "Occult_Blood", "Referrals", "ref_codes", "ref_dups", "Dimut",
          "Rentgen", "Ultrasound", "ER", "Emer", "Consult", "Mammo", "Density",
          "Dufler", "Heart_Eco", "Gastro", "Biopsy", "PSA", "EKG",
          "Heart_Machon", "Triglycerides", "Phosphatase", "TSH", "Alt_GPT",
          "VitaminB12", "AST_GOT", "Ferritin", "Drug", "Machon")
for (outcome in vars) {
  values <- Data[[outcome]]
  # Infinite values first, then missing values
  values[is.infinite(values)] <- 0
  values[is.na(values)] <- 0
  Data[[outcome]] <- values
}


# Add timing outcomes
# Last visit of the physician's day, and a flag for the visit that is last
Data <- mutate(group_by(Data, date, doctor_id), last = max(num_visit))
Data$Finish <- Data$last == Data$num_visit
# Start and end of the physician's day, as decimal hours
Data <- mutate(group_by(Data, doctor_id, date),
               Finish_The_Day_Time = max(Time_Fixed, na.rm = TRUE),
               Start_The_Day_Time = min(Time_Fixed, na.rm = TRUE))
# Breaks are counted as rows with a missing visit length. A physician-day is
# present once per event it belongs to, so the count is taken within each
# Event_ID; counting over the physician-day alone would add up the copies.
Data <- mutate(group_by(Data, doctor_id, date, Event_ID),
               Breaks_During_The_Day = sum(is.na(visit_length)))
# Restore the physician-day grouping the data carried before this change
Data <- group_by(Data, doctor_id, date)
Data$Day_Hours <- Data$Finish_The_Day_Time - Data$Start_The_Day_Time

# Main outcomes: any of the listed lab tests, any of the listed referrals,
# and either of the two
Data$Lab_Test <- (Data$Blood + Data$Cholesterol + Data$Triglycerides +
                    Data$Phosphatase + Data$Alt_GPT) > 0
Data$Screening_Test <- (Data$Machon + Data$Heart_Machon + Data$Dimut +
                          Data$Ultrasound + Data$Rentgen) > 0
Data$Diagnostic_Test <- (Data$Lab_Test + Data$Screening_Test) > 0

# Save
saveRDS(Data, file.path(pd, "data", "data_2"))
###########################################################################

## Build the visit level data
###########################################################################

# Import Data
Data <- readRDS(file.path(pd, "data", "data_2"))

# ADD Time Variables, all taken from the visit timestamp: week of the year,
# weekday name and day of the month (characters 9-10 of the timestamp text)
Data$Yearweek <- week(Data$time)
Data$Weekday <- weekdays(Data$time)
Data$Monthday <- substr(Data$time, 9, 10)

# Add Patient Details. The columns are picked by position; only visits whose
# patient appears in the patient file are kept by the merge.
Pat <- readRDS(file.path("R_Mac", "R_files", "Patients"))
Pat <- Pat[, c(1:4, 10, 17:31)]
gc()
Data <- merge(Data, Pat, by = "patient_id")
Data$year <- year(Data$date)
Data$month <- month(Data$date)
# birth_date is subtracted from the visit year, so it is presumably a birth
# year - TODO confirm
Data$age <- Data$year - Data$birth_date

# Chronic conditions
# Indicator name -> patient column holding the date attached to the condition.
# The order is the order in which the indicator columns are created.
condition_dates <- c(
  TIA = "date_from_tia",
  Diabetic = "date_from_diabetic",
  CVD = "date_from_cvd",
  Hashmana = "date_from_hashmana",
  Cancer_Old = "date_from_cancer",
  Blood_pressure = "date_from_bloodpresure",
  Dializa = "date_from_dializa",
  COPD = "date_from_copd",
  Osteo = "date_from_osteo",
  CHF = "date_from_chf",
  CKD = "date_from_ckd",
  Fertility = "date_from_fertility",
  Cardio = "date_from_cardio"
)

# Condition present at the visit: the condition date precedes the visit time.
# Dates in the year 1800 are treated as "no condition" and set to 0.
for (condition in names(condition_dates)) {
  condition_date <- Data[[condition_dates[[condition]]]]
  present <- difftime(as.Date(condition_date), Data$time, units = "days") < 0
  present[year(condition_date) == 1800] <- 0
  Data[[condition]] <- present
}

# Recent chronic conditions: the condition date precedes the visit by fewer
# than 181 days.
# units = "days" is required here: with the default automatic units difftime()
# expresses the whole vector in hours, minutes or seconds as soon as the
# smallest gap is under a day, and the -181 cut-off then stops meaning days.
for (condition in names(condition_dates)) {
  condition_date <- Data[[condition_dates[[condition]]]]
  gap_days <- difftime(as.Date(condition_date), Data$time, units = "days")
  Data[[paste0(condition, "_Recent")]] <- gap_days < 0 & gap_days > (-181)
}
rm(condition, condition_date, present, gap_days)


# 2 visits bins (0 is still a single visit, which is the event):
# visits after the event are paired as 1,2 -> 1; 3,4 -> 2; ... and visits
# before it as -1,-2 -> -1; -3,-4 -> -2; ...
visit_offset <- Data$Visits_From_Event
before_event <- visit_offset < 0
visit_bin <- floor(visit_offset / 2 + 0.7)
visit_bin[before_event] <- floor(visit_offset[before_event] / 2)
Data$Bins_2_Visits_From_Event <- visit_bin

# Dummies for visits relative to event: BI_Pre_9 ... BI_Pre_1, BI_0,
# BI_1 ... BI_9, each equal to 1 in its own bin and 0 elsewhere
for (bin in -9:9) {
  dummy_name <- if (bin < 0) paste0("BI_Pre_", -bin) else paste0("BI_", bin)
  Data[[dummy_name]] <- as.numeric(visit_bin %in% bin)
}

# Interacted with the treatment indicator: TBI_9 ... TBI_0 first, then
# TBI_Pre_9 ... TBI_Pre_1
for (bin in c(9:0, -9:-1)) {
  bin_label <- if (bin < 0) paste0("Pre_", -bin) else as.character(bin)
  Data[[paste0("TBI_", bin_label)]] <- Data$Treat * Data[[paste0("BI_", bin_label)]]
}
rm(visit_offset, before_event, visit_bin, bin, dummy_name, bin_label)

# Save
saveRDS(Data, file.path(pd, "data", "data_3"))

###########################################################################

## Add characteristics 
###########################################################################
# Data <- readRDS(file.path(pd,"data","data_3"))

# Restrict the sample to the relevant period: drop the visits of
# January-June 2012
Data$year <- year(Data$date)
Data$month <- month(Data$date)
Data <- Data[!(Data$year == 2012 & Data$month <= 6), ]

# Doctor details. The columns are picked by position; only visits whose
# physician appears in the characteristics file are kept by the merge.
doc <- readRDS(file.path("R_Mac", "R_files", "Doctor_Characteristics"))
doc <- doc[, c(1, 6, 10:13)]
Data <- merge(Data, doc, by = "doctor_id")

# List countries by frequency (patients' in `a`, physicians' in `aa`)
a <- summarise(group_by(Data, country), total = n())
a <- a[order(-a$total), ]
aa <- summarise(group_by(Data, doc_country_code), total = n())
aa <- aa[order(-aa$total), ]

# Mark Israeli doctors (country code 4)
Data$Israeli_Doc <- as.numeric(Data$doc_country_code == 4)

# Add Event's patient's characteristics
# Event_Treat is 1 only on the visit that is the difficult case itself
Data$Event_Treat <- Data$Event * Data$Treat
# Flags that can be TRUE only on that visit: gender code 1, and visit year
# minus birth_date above 62
Data$Male_Event <- Data$gender == 1 & Data$Event_Treat == 1
Data$Above_62 <- (Data$year - Data$birth_date) > 62 & Data$Event_Treat == 1
# Spread both flags to every visit that shares the Real_Event_ID
Data <- Data %>%
  group_by(Real_Event_ID) %>%
  mutate(Male_Event = max(Male_Event),
         Above_62_Event = max(Above_62, na.rm = TRUE))

# Add death data: the first two columns of the workbook are used as the
# patient identifier and the date of death
Cancer <- read_xlsx("raw_data/CancerStatus.xlsx")
Cancer <- Cancer[, 1:2]
colnames(Cancer) <- c("patient_id", "Death")
Cancer$Death <- as.Date(Cancer$Death)

# One row per difficult case (the event visit itself), with the death date
keep <- Data[Data$Event_Treat == 1, ]
keep1 <- merge(keep[, c("patient_id", "Real_Event_ID", "date", "Cancer")],
               Cancer, by = "patient_id", all.x = TRUE)
# Event date minus death date, in days: negative when death follows the event
keep1$diff <- difftime(keep1$date, keep1$Death, units = "days")
keep1$Death_4_Years <- keep1$diff > (-365 * 4)
# NOTE(review): Death_4_Years is NA when there is no death date, and the mean
# below skips NAs - if a missing date means the patient is alive, Mortality_4
# is computed among the deceased only. Confirm how survivors are recorded.
aa <- summarise(group_by(keep1, Cancer),
                total = n(),
                Mortality_4 = mean(Death_4_Years, na.rm = TRUE))
write.csv(aa, file.path(pd, "outputs", "cancer_types_mortality.csv"))
# Attach the flag to every visit of the event (comparison cases included)
Data <- merge(Data, keep1[, c("Real_Event_ID", "Death_4_Years")],
              by = "Real_Event_ID")

# Save
saveRDS(Data, file.path(pd, "data", "data_final"))

###########################################################################
